Explore correlations¶

  • Correlations among year, loan_grade, purpose

    • Plots presented in this section show distinct differences in the distribution of loan grades for different years and loan purposes, but no dramatic patterns.

    • The distribution of loan purposes varies only slightly as a function of year.

  • Correlations involving loan_amnt

    • The largest loans, in the range of \$36k to \$40k, have much better loan grades than smaller loans.

    • Loans in the range of \$5k to \$11k have somewhat better grades than other loans less than \$36k.

    • The year 2018 shows a distinct change in the distribution of loan amounts. Loan amounts that are multiples of \$5k become more frequent, as do loans of \$36k and above.

  • Correlations involving term

    • The loan term is strongly correlated with loan grade, tending to increase with poorer loan grade.

    • Almost all loans below \$10k have a term of 36 months. Loan terms tend to increase with increasing loan amount up to around \$25k, and the mean loan term for larger loans is a little below 50 months.

    • The distribution of loan term shows distinct variations as a function of year and loan purpose, but no strong patterns.

This notebook presents initial exploration of correlations involving selected features in the loan data.

Later notebooks present in-depth analysis of particular features, e.g., int_rate, including correlations involving those features.

In [1]:
import re

import numpy as np
import pandas as pd
import plotly.express as px
from IPython.display import display

import notebook_tools.database as db
from notebook_tools.derived_features import get_year
from notebook_tools.feature_exploration import (
    get_group_sizes,
    get_value_counts,
    style_value_counts,
)
In [2]:
# Load the loan table and its metadata through the project's database helpers.
loan_data = db.get_loan_data()
# NOTE(review): loan_metadata is not referenced again in the visible part of
# this notebook.
loan_metadata = db.get_loan_metadata()
In [3]:
# Add / replace the columns used throughout this notebook:
#   year - obtained from the "issue_d" column via the project helper get_year
#   term - the original term value rendered as text, e.g. "36 months"
issue_year = get_year(loan_data, "issue_d")
term_labels = loan_data["term"].map(lambda n: str(n) + " months")
loan_data = loan_data.assign(year=issue_year, term=term_labels)

Correlations among year, loan_grade, purpose¶

Distributions of individual features¶

In [4]:
# Tabulate the number of loans per year and display the styled table.
year_counts = get_value_counts(loan_data["year"])
styled_year_counts = style_value_counts(year_counts)
display(styled_year_counts)
  count
year  
2018 495,163
2017 443,447
2016 434,254
2015 420,954
2014 235,551
2013 134,786
2012 53,352
In [5]:
# Number of loans for every (grade, sub_grade) pair.  The trailing expression is
# left bare so that the notebook displays the resulting one-column frame.
sub_grade_sizes = loan_data.groupby(by=["grade", "sub_grade"]).size()
sub_grade_sizes.to_frame(name="count")
Out[5]:
count
grade sub_grade
A A1 85648
A2 68042
A3 71361
A4 92969
A5 104824
B B1 123437
B2 124476
B3 128488
B4 137166
B5 137455
C C1 143589
C2 128908
C3 127480
C4 125707
C5 115390
D D1 80713
D2 71379
D3 63469
D4 55718
D5 46984
E E1 32673
E2 29122
E3 26026
E4 22199
E5 22162
F F1 13016
F2 8987
F3 7553
F4 5909
F5 5009
G G1 3963
G2 2579
G3 2013
G4 1612
G5 1481
In [6]:
# Tabulate the number of loans per purpose and display the styled table.
purpose_counts = get_value_counts(loan_data["purpose"])
styled_purpose_counts = style_value_counts(purpose_counts)
display(styled_purpose_counts)
  count
purpose  
debt_consolidation 1,257,717
credit_card 511,384
home_improvement 147,222
other 134,972
major_purchase 48,121
medical 26,732
small_business 22,693
car 22,389
vacation 15,120
moving 14,760
house 13,705
wedding 1,351
renewable_energy 1,339
educational 2
In [7]:
# Line chart of the number of loans for each value of "issue_d".
to_plot = get_group_sizes(loan_data, group_by="issue_d")
axis_labels = {"issue_d": "Loan date", "count": "Number of loans"}
# ":.3s" is a d3 format: SI prefix with three significant digits.
hover_formats = {"count": ":.3s"}
fig = px.line(
    to_plot,
    x="issue_d",
    y="count",
    title="Number of loans by date",
    labels=axis_labels,
    hover_data=hover_formats,
    markers=True,
)
fig.show()
In [8]:
# Bar chart of the number of loans per year.
to_plot = get_group_sizes(loan_data, group_by="year")
axis_labels = {"year": "Year", "count": "Number of loans"}
# ":.3s" is a d3 format: SI prefix with three significant digits.
hover_formats = {"count": ":.3s"}
fig = px.bar(
    to_plot,
    x="year",
    y="count",
    title="Number of loans by year",
    labels=axis_labels,
    hover_data=hover_formats,
)
fig.show()
In [9]:
# Bar chart of the number of loans per loan grade.
to_plot = get_group_sizes(loan_data, group_by="grade")
axis_labels = {"grade": "Loan grade", "count": "Number of loans"}
# ":.3s" is a d3 format: SI prefix with three significant digits.
hover_formats = {"count": ":.3s"}
fig = px.bar(
    to_plot,
    x="grade",
    y="count",
    title="Number of loans by loan grade",
    labels=axis_labels,
    hover_data=hover_formats,
)
fig.show()
In [10]:
# Stacked bar chart: one bar per grade, one colored segment per sub-grade.
to_plot = get_group_sizes(loan_data, group_by=["grade", "sub_grade"])
# Keep only the second character of each sub-grade label (e.g. "A1" becomes
# "1"); that character is what the bars are colored by.
sub_grade_digit = to_plot["sub_grade"].str[1]
to_plot["sub_grade"] = sub_grade_digit
axis_labels = {
    "grade": "Loan grade",
    "count": "Number of loans",
    "sub_grade": "Sub-grade",
}
fig = px.bar(
    to_plot,
    x="grade",
    y="count",
    color="sub_grade",
    title="Number of loans by loan grade and sub-grade",
    labels=axis_labels,
    hover_data={"count": ":.3s"},
)
fig.show()
In [11]:
# Bar chart of the number of loans per purpose, most frequent purpose first.
purpose_sizes = get_group_sizes(loan_data, group_by="purpose")
to_plot = purpose_sizes.sort_values("count", ascending=False)

# Remember the purposes in descending order of frequency; later plots pass this
# list as the category order of their "purpose" axis or legend.
ordered_loan_purposes = list(to_plot["purpose"])

axis_labels = {"purpose": "Loan purpose", "count": "Number of loans"}
fig = px.bar(
    to_plot,
    x="purpose",
    y="count",
    title="Number of loans by purpose",
    labels=axis_labels,
    hover_data={"count": ":.3s"},
)
fig.show()

Correlations between features¶

In [12]:
# Stacked bars, one per year, showing the share of loans in each grade.
to_plot = get_group_sizes(loan_data, group_by=["year", "grade"])
axis_labels = {"year": "Year", "count": "Number of loans", "grade": "Grade"}
fig = px.histogram(
    to_plot,
    x="year",
    y="count",
    color="grade",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    title="Distribution of loan grade by year",
    labels=axis_labels,
    height=400,
)


def clean_up_hovertemplate(trace):
    """Show the normalized value as a percentage under a short label."""
    template = trace.hovertemplate
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Percentage of loans", tickformat=".2p")
fig.show()

For a simpler view of correlations involving the loan grade, map the grades to numbers and calculate the mean grade for different groups.

In [13]:
# Numeric encoding of the letter grades: "A" maps to 7, decreasing by one per
# letter down to "G", which maps to 1.
loan_grade_mapper = dict(zip("ABCDEFG", range(7, 0, -1)))
In [14]:
# Bar chart of the mean numeric grade per year.
numeric_grades = loan_data[["year", "grade"]].assign(
    # Replace each letter grade by its number from loan_grade_mapper.
    grade=lambda df: df["grade"].map(loan_grade_mapper)
)
to_plot = numeric_grades.groupby("year").mean().reset_index()
axis_labels = {"year": "Year", "grade": "Mean numeric grade"}
fig = px.bar(
    to_plot,
    x="year",
    y="grade",
    title="Mean numeric grades by year (A=7, B=6, C=5, ..., G=1)",
    labels=axis_labels,
    hover_data={"grade": ":.2f"},
)
fig.show()
In [15]:
# Stacked bars, one per year, showing the share of loans for each purpose.
to_plot = get_group_sizes(loan_data, group_by=["year", "purpose"])
axis_labels = {"year": "Year", "count": "Number of loans", "purpose": "Purpose"}
fig = px.histogram(
    to_plot,
    x="year",
    y="count",
    color="purpose",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    # Order the purposes by the list saved when plotting loans by purpose.
    category_orders={"purpose": ordered_loan_purposes},
    title="Distribution of loan purpose by year",
    labels=axis_labels,
    height=500,
)


def clean_up_hovertemplate(trace):
    """Show the normalized value as a percentage under a short label."""
    template = trace.hovertemplate
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Percentage of loans", tickformat=".2p")
fig.show()
In [16]:
# Stacked bars, one per loan purpose, showing the share of loans in each grade.
to_plot = get_group_sizes(loan_data, group_by=["purpose", "grade"])
axis_labels = {
    "purpose": "Loan purpose",
    "count": "Number of loans",
    "grade": "Grade",
}
fig = px.histogram(
    to_plot,
    x="purpose",
    y="count",
    color="grade",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    # Order the purposes by the list saved when plotting loans by purpose.
    category_orders={"purpose": ordered_loan_purposes},
    title="Distribution of loan grade by loan purpose",
    labels=axis_labels,
    height=400,
)


def clean_up_hovertemplate(trace):
    """Show the normalized value as a percentage under a short label."""
    template = trace.hovertemplate
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Percentage of loans", tickformat=",.2p")
fig.show()
In [17]:
# Bar chart of the mean numeric grade per loan purpose.
numeric_grades = loan_data[["purpose", "grade"]].assign(
    # Replace each letter grade by its number from loan_grade_mapper.
    grade=lambda df: df["grade"].map(loan_grade_mapper)
)
to_plot = numeric_grades.groupby("purpose").mean().reset_index()
axis_labels = {"purpose": "Loan purpose", "grade": "Mean numeric grade"}
fig = px.bar(
    to_plot,
    x="purpose",
    y="grade",
    # Order the purposes by the list saved when plotting loans by purpose.
    category_orders={"purpose": ordered_loan_purposes},
    title="Mean numeric grades by loan purpose (A=7, B=6, C=5, ..., G=1)",
    labels=axis_labels,
    hover_data={"grade": ":.2f"},
)
fig.show()

Conclusions:

  • The distribution of loan grades shows distinct variation for different years and loan purposes, but no dramatic patterns.
  • The distribution of loan purposes varies only slightly as a function of year.

Correlations involving loan_amnt¶

For large data sets, the binning of histogram data should be done outside of plotly. The reason is that plotly does binning in JavaScript, and so unbinned data passed to plotly's histogram function becomes part of the JavaScript code stored with the notebook. For the current data set, this can increase the notebook size on disk by a factor of more than 100.

In [18]:
# Report the smallest and largest loan amounts in the data.
# The variables carry feature-specific names so that the built-in min() and
# max() functions stay usable in every later cell of the notebook.
min_loan_amnt = loan_data["loan_amnt"].min()
max_loan_amnt = loan_data["loan_amnt"].max()
print(
    'The minimum and maximum values of "loan_amnt" '
    f"are ${min_loan_amnt:,} and ${max_loan_amnt:,}, respectively."
)
The minimum and maximum values of "loan_amnt" are $1,000.0 and $40,000.0, respectively.
In [19]:
# Arrays / lists used to bin "loan_amnt" and to draw the bins in plotly.
#
# NOTE(review): in this rendering the labels contain literal dollar signs.  The
# original comment said a substitute is used in place of the dollar sign "in
# order to avoid triggering math formatting", so the raw notebook may use an
# HTML entity (e.g. &#36;) here - confirm against the notebook source.

# 41 evenly spaced edges from 1,000 to 41,000 delimit 40 bins of width 1,000.
loan_amnt_bins = np.linspace(1_000.0, 41_000.0, num=41)

# One label per bin, running from "[$1k - $2k)" to "[$40k - $41k)".
loan_amnt_bin_labels = [f"[${lower:d}k - ${lower + 1:d}k)" for lower in range(1, 41)]

# Axis ticks: every fifth bin (lower edges $5k, $10k, ..., $40k), each shown
# with just its lower edge as the tick text.
loan_amnt_tick_vals = [loan_amnt_bin_labels[index] for index in range(4, 40, 5)]
loan_amnt_tick_text = [f"${lower:d}k" for lower in range(5, 41, 5)]
In [20]:
# Assign every loan to its labelled amount bin.  right=False makes the bins
# half-open on the right, [lower, upper), so the maximum amount of 40,000 lands
# in the last bin, whose upper edge is 41,000.
loan_amnt_bin = pd.cut(
    loan_data["loan_amnt"],
    loan_amnt_bins,
    right=False,
    labels=loan_amnt_bin_labels,
)
loan_data["loan_amnt_bin"] = loan_amnt_bin
In [21]:
# Bar chart of the number of loans in each loan-amount bin, drawn without gaps
# between the bars so that it reads as a histogram.
to_plot = get_group_sizes(loan_data, group_by="loan_amnt_bin")
axis_labels = {"count": "Number of loans", "loan_amnt_bin": "Loan amount"}
fig = px.bar(
    to_plot,
    x="loan_amnt_bin",
    y="count",
    title="Distribution of loan amount",
    labels=axis_labels,
)
hovertemplate = "Loan amount=%{customdata}<br>Number of loans=%{y:.3s}<extra></extra>"
# NOTE(review): the full label list is attached as customdata, which presumes
# to_plot has one row per bin in label order - confirm against get_group_sizes.
fig.update_traces(hovertemplate=hovertemplate, customdata=loan_amnt_bin_labels)
fig.update_layout(bargap=0)
# Show a tick only at every fifth bin, labelled with the bin's lower edge.
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.show()
In [22]:
# Stacked bars, one per loan-amount bin, showing the share of loans per grade.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "grade"])
axis_labels = {
    "loan_amnt_bin": "Loan amount",
    "count": "Number of loans",
    "grade": "Grade",
}
fig = px.histogram(
    to_plot,
    x="loan_amnt_bin",
    y="count",
    color="grade",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    title="Distribution of loan grade by loan amount",
    labels=axis_labels,
)


def clean_up_hovertemplate(trace):
    """Label the hover text with the bin range and a percentage value."""
    # NOTE(review): attaches all bin labels to the trace, which presumes each
    # trace holds one value per bin in label order - confirm.
    trace.customdata = loan_amnt_bin_labels
    template = trace.hovertemplate.replace("%{x}", "%{customdata}")
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_layout(bargap=0)
# Show a tick only at every fifth bin, labelled with the bin's lower edge.
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.update_yaxes(title_text="Percentage of loans", tickformat=",.2p")
fig.show()
In [23]:
# Bar chart of the mean numeric grade in each loan-amount bin.
numeric_grades = loan_data[["loan_amnt_bin", "grade"]].assign(
    # Replace each letter grade by its number from loan_grade_mapper.
    grade=lambda df: df["grade"].map(loan_grade_mapper)
)
to_plot = (
    numeric_grades.groupby("loan_amnt_bin", observed=False).mean().reset_index()
)
axis_labels = {
    "loan_amnt_bin": "Loan amount",
    "grade": "Mean numeric grade",
}
fig = px.bar(
    to_plot,
    x="loan_amnt_bin",
    y="grade",
    title="Mean numeric grades by loan amount (A=7, B=6, C=5, ..., G=1)",
    labels=axis_labels,
)
hovertemplate = (
    "Loan amount=%{customdata}<br>Mean numeric grade=%{y:.2f}<extra></extra>"
)
# NOTE(review): the full label list is attached as customdata, which presumes
# to_plot has one row per bin in label order - confirm.
fig.update_traces(hovertemplate=hovertemplate, customdata=loan_amnt_bin_labels)
fig.update_layout(bargap=0)
# Show a tick only at every fifth bin, labelled with the bin's lower edge.
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.show()
In [24]:
# Stacked bars, one per loan-amount bin, showing the share of loans per year.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "year"])
axis_labels = {
    "loan_amnt_bin": "Loan amount",
    "count": "Number of loans",
    "year": "Year",
}
fig = px.histogram(
    to_plot,
    x="loan_amnt_bin",
    y="count",
    color="year",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    title="Distribution of loan year by loan amount",
    labels=axis_labels,
)


def clean_up_hovertemplate(trace):
    """Label the hover text with the bin range and a percentage value."""
    # NOTE(review): attaches all bin labels to the trace, which presumes each
    # trace holds one value per bin in label order - confirm.
    trace.customdata = loan_amnt_bin_labels
    template = trace.hovertemplate.replace("%{x}", "%{customdata}")
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_layout(bargap=0)
# Show a tick only at every fifth bin, labelled with the bin's lower edge.
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.update_yaxes(title_text="Percentage of loans", tickformat=",.2p")
fig.show()
In [25]:
# Bar chart of the mean encoded year in each loan-amount bin.
#
# The years are encoded as consecutive integers so that a mean can be taken.
# The keys are strings, which assumes that the "year" column holds strings such
# as "2012" - TODO confirm against get_year.
encoded_year_mapper = {
    "2012": 1,
    "2013": 2,
    "2014": 3,
    "2015": 4,
    "2016": 5,
    "2017": 6,
    "2018": 7,
}
to_plot = (
    loan_data[["loan_amnt_bin", "year"]]
    .assign(year=lambda df: df["year"].map(encoded_year_mapper))
    .groupby("loan_amnt_bin", observed=False)
    .mean()
    .reset_index()
)
fig = px.bar(
    to_plot,
    x="loan_amnt_bin",
    y="year",
    labels={
        "loan_amnt_bin": "Loan amount",
        "year": "Mean encoded year",
    },
    # The legend in the title must agree with encoded_year_mapper: 2016 is the
    # year encoded as 5 (2015 is encoded as 4).
    title="Mean encoded year by loan amount (2018=7, 2017=6, 2016=5, ..., 2012=1)",
)
hovertemplate = "Loan amount=%{customdata}<br>Mean encoded year=%{y:.2f}<extra></extra>"
# NOTE(review): the full label list is attached as customdata, which presumes
# to_plot has one row per bin in label order - confirm.
fig.update_traces(customdata=loan_amnt_bin_labels, hovertemplate=hovertemplate)
fig.update_layout(bargap=0)
# Show a tick only at every fifth bin, labelled with the bin's lower edge.
fig.update_xaxes(
    tickmode="array", tickvals=loan_amnt_tick_vals, ticktext=loan_amnt_tick_text
)
fig.show()
In [26]:
# Stacked bars, one per loan-amount bin, showing the share of loans per purpose.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "purpose"])
axis_labels = {
    "loan_amnt_bin": "Loan amount",
    "count": "Number of loans",
    "purpose": "Purpose",
}
fig = px.histogram(
    to_plot,
    x="loan_amnt_bin",
    y="count",
    color="purpose",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    # Order the purposes by the list saved when plotting loans by purpose.
    category_orders={"purpose": ordered_loan_purposes},
    title="Distribution of loan purpose by loan amount",
    labels=axis_labels,
    height=500,
)


def clean_up_hovertemplate(trace):
    """Label the hover text with the bin range and a percentage value."""
    # NOTE(review): attaches all bin labels to the trace, which presumes each
    # trace holds one value per bin in label order - confirm.
    trace.customdata = loan_amnt_bin_labels
    template = trace.hovertemplate.replace("%{x}", "%{customdata}")
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_layout(bargap=0)
# Show a tick only at every fifth bin, labelled with the bin's lower edge.
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.update_yaxes(title_text="Percentage of loans", tickformat=",.2p")
fig.show()

Conclusions:

  • The largest loans, in the range of \$36k to \$40k, have much better loan grades than smaller loans.
  • Loans in the range of \$5k to \$11k have somewhat better grades than other loans less than \$36k.
  • The year 2018 shows a distinct change in the distribution of loan amounts. Loan amounts that are multiples of \$5k become more frequent, as do loans of \$36k and above.

Correlations involving term¶

In [27]:
# Tabulate the number of loans per loan term and display the styled table.
term_counts = get_value_counts(loan_data["term"])
styled_term_counts = style_value_counts(term_counts)
display(styled_term_counts)
  count
term  
36 months 1,577,886
60 months 639,621
In [28]:
# Bar chart of the number of loans per loan term.
to_plot = get_group_sizes(loan_data, group_by="term")
axis_labels = {"term": "Loan term", "count": "Number of loans"}
# ":.3s" is a d3 format: SI prefix with three significant digits.
hover_formats = {"count": ":.3s"}
fig = px.bar(
    to_plot,
    x="term",
    y="count",
    title="Number of loans by loan term",
    labels=axis_labels,
    hover_data=hover_formats,
)
fig.show()
In [29]:
# Stacked bars, one per grade, showing the share of loans with each term.
to_plot = get_group_sizes(loan_data, group_by=["grade", "term"])
axis_labels = {"grade": "Grade", "count": "Number of loans", "term": "Loan term"}
fig = px.histogram(
    to_plot,
    x="grade",
    y="count",
    color="term",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    title="Distribution of loan term by grade",
    labels=axis_labels,
)


def clean_up_hovertemplate(trace):
    """Show the normalized value as a percentage under a short label."""
    template = trace.hovertemplate
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Percentage of loans", tickformat=".2p")
fig.show()
In [30]:
# Bar chart of the mean loan term, in months, per grade.
term_in_months = loan_data[["grade", "term"]].assign(
    # Turn text such as "36 months" back into an integer so it can be averaged.
    term=lambda df: df["term"].str.replace("months", "").str.strip().astype("Int64")
)
to_plot = term_in_months.groupby("grade").mean().reset_index()
axis_labels = {"grade": "Grade", "term": "Mean loan term"}
fig = px.bar(
    to_plot,
    x="grade",
    y="term",
    title="Mean loan term by grade",
    labels=axis_labels,
    hover_data={"term": ":.1f"},
)


def clean_up_hovertemplate(trace):
    """Append the unit " months" to the y value shown in the hover text."""
    # The pattern matches the y placeholder together with any format it carries.
    with_unit = re.sub(r"(%{y.*?})", r"\1 months", trace.hovertemplate)
    trace.hovertemplate = with_unit


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Mean loan term (months)")
fig.show()
In [31]:
# Stacked bars, one per year, showing the share of loans with each term.
to_plot = get_group_sizes(loan_data, group_by=["year", "term"])
axis_labels = {"year": "Year", "count": "Number of loans", "term": "Loan term"}
fig = px.histogram(
    to_plot,
    x="year",
    y="count",
    color="term",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    title="Distribution of loan term by year",
    labels=axis_labels,
)


def clean_up_hovertemplate(trace):
    """Show the normalized value as a percentage under a short label."""
    template = trace.hovertemplate
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Percentage of loans", tickformat=".2p")
fig.show()
In [32]:
# Bar chart of the mean loan term, in months, per year.
term_in_months = loan_data[["year", "term"]].assign(
    # Turn text such as "36 months" back into an integer so it can be averaged.
    term=lambda df: df["term"].str.replace("months", "").str.strip().astype("Int64")
)
to_plot = term_in_months.groupby("year").mean().reset_index()
axis_labels = {"year": "Year", "term": "Mean loan term"}
fig = px.bar(
    to_plot,
    x="year",
    y="term",
    title="Mean loan term by year",
    labels=axis_labels,
    hover_data={"term": ":.1f"},
)


def clean_up_hovertemplate(trace):
    """Append the unit " months" to the y value shown in the hover text."""
    # The pattern matches the y placeholder together with any format it carries.
    with_unit = re.sub(r"(%{y.*?})", r"\1 months", trace.hovertemplate)
    trace.hovertemplate = with_unit


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Mean loan term (months)")
fig.show()
In [33]:
# Stacked bars, one per loan purpose, showing the share of loans with each term.
to_plot = get_group_sizes(loan_data, group_by=["purpose", "term"])
axis_labels = {
    "purpose": "Loan purpose",
    "count": "Number of loans",
    "term": "Loan term",
}
fig = px.histogram(
    to_plot,
    x="purpose",
    y="count",
    color="term",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    # Order the purposes by the list saved when plotting loans by purpose.
    category_orders={"purpose": ordered_loan_purposes},
    title="Distribution of loan term by loan purpose",
    labels=axis_labels,
)


def clean_up_hovertemplate(trace):
    """Show the normalized value as a percentage under a short label."""
    template = trace.hovertemplate
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Percentage of loans", tickformat=".2p")
fig.show()
In [34]:
# Bar chart of the mean loan term, in months, per loan purpose.
term_in_months = loan_data[["purpose", "term"]].assign(
    # Turn text such as "36 months" back into an integer so it can be averaged.
    term=lambda df: df["term"].str.replace("months", "").str.strip().astype("Int64")
)
to_plot = term_in_months.groupby("purpose").mean().reset_index()
axis_labels = {"purpose": "Loan purpose", "term": "Mean loan term"}
fig = px.bar(
    to_plot,
    x="purpose",
    y="term",
    # Order the purposes by the list saved when plotting loans by purpose.
    category_orders={"purpose": ordered_loan_purposes},
    title="Mean loan term by loan purpose",
    labels=axis_labels,
    hover_data={"term": ":.1f"},
)


def clean_up_hovertemplate(trace):
    """Append the unit " months" to the y value shown in the hover text."""
    # The pattern matches the y placeholder together with any format it carries.
    with_unit = re.sub(r"(%{y.*?})", r"\1 months", trace.hovertemplate)
    trace.hovertemplate = with_unit


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Mean loan term (months)")
fig.show()
In [35]:
# Stacked bars, one per loan-amount bin, showing the share of loans per term.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "term"])
axis_labels = {
    "loan_amnt_bin": "Loan amount",
    "count": "Number of loans",
    "term": "Loan term",
}
fig = px.histogram(
    to_plot,
    x="loan_amnt_bin",
    y="count",
    color="term",
    # Normalize every bar so that its segments are fractions of the bar total.
    barnorm="fraction",
    title="Distribution of loan term by loan amount",
    labels=axis_labels,
)


def clean_up_hovertemplate(trace):
    """Label the hover text with the bin range and a percentage value."""
    # NOTE(review): attaches all bin labels to the trace, which presumes each
    # trace holds one value per bin in label order - confirm.
    trace.customdata = loan_amnt_bin_labels
    template = trace.hovertemplate.replace("%{x}", "%{customdata}")
    # d3 ".3p": percentage with three significant digits.
    template = template.replace("%{y}", "%{y:.3p}")
    template = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )
    trace.hovertemplate = template


fig.for_each_trace(clean_up_hovertemplate)
fig.update_layout(bargap=0)
# Show a tick only at every fifth bin, labelled with the bin's lower edge.
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.update_yaxes(title_text="Percentage of loans", tickformat=",.2p")
fig.show()
In [36]:
# Bar chart of the mean loan term, in months, in each loan-amount bin.
term_in_months = loan_data[["loan_amnt_bin", "term"]].assign(
    # Turn text such as "36 months" back into an integer so it can be averaged.
    term=lambda df: df["term"].str.replace("months", "").str.strip().astype("Int64")
)
to_plot = (
    term_in_months.groupby("loan_amnt_bin", observed=False).mean().reset_index()
)
axis_labels = {
    "loan_amnt_bin": "Loan amount",
    "term": "Mean loan term",
}
fig = px.bar(
    to_plot,
    x="loan_amnt_bin",
    y="term",
    title="Mean loan term by loan amount",
    labels=axis_labels,
    hover_data={"term": ":.1f"},
)


def clean_up_hovertemplate(trace):
    """Append the unit " months" to the y value shown in the hover text."""
    # The pattern matches the y placeholder together with any format it carries.
    with_unit = re.sub(r"(%{y.*?})", r"\1 months", trace.hovertemplate)
    trace.hovertemplate = with_unit


fig.for_each_trace(clean_up_hovertemplate)
fig.update_layout(bargap=0)
# Show a tick only at every fifth bin, labelled with the bin's lower edge.
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.show()

Conclusions:

  • The loan term is strongly correlated with loan grade, tending to increase with poorer loan grade.

  • Almost all loans below \$10k have a term of 36 months. Loan terms tend to increase with increasing loan amount up to around \$25k, and the mean loan term for larger loans is a little below 50 months.

  • The distribution of loan term shows distinct variations as a function of year and loan purpose, but no strong patterns.